For reference: ### NHANES 3
library(SAScii)
# nhanes3.tf <- tempfile()
daturl <- "https://wwwn.cdc.gov/nchs/data/nhanes3/1a/adult.dat"
code_url ="https://wwwn.cdc.gov/nchs/data/nhanes3/1a/adult.sas"
# Sas_code <- url(code_url)
# writeLines ( readLines(Sas_code) , con = nhanes3.tf )
# nhanes3.fwf.parameters <- parse.SAScii( nhanes3.tf , beginline = 5 )
# str( nhanes3.fwf.parameters )
# #-----
# 'data.frame': 90 obs. of 4 variables:
# $ varname: chr "SEQN" "HYK1A" "HYK1B" "HYK2A" ...
# $ width : num 5 1 1 2 2 2 2 4 4 2 ...
# $ char : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
# $ divisor: num 1 1 1 1 1 1 1 1 1 1 ...
# #------
daturl <- "https://wwwn.cdc.gov/nchs/data/nhanes3/1a/adult.dat"
in.nhanes3 <- read.fwf(daturl, widths=nhanes3.fwf.parameters$width,
col.names= nhanes3.fwf.parameters$varname)
in2 <- read.SAScii( daturl, code_url)
#write_csv(in2, "big_data/NHANES/nhanes_3/nhanes3.csv")
nhanes3_data <- read_csv("big_data/NHANES/nhanes_3/nhanes3.csv")
nhanes3_selected <- nhanes3_data %>%
select(SEQN,
DMPFSEQ,
HSAGEIR, # age in years
HAB1, # self-rated health: 1:excellent, very good, good, fair, 5: poor (get rid of 6 and 7)
HSSEX, # 1 male, 2 female
SDPPHASE, # 1 1988-1991, 2 1991-1994
HSDOIMO, # date of screener (month)
HSAGEU, # age unit
HSAITMOR # age in months at interview (screener)
) %>%
filter(HAB1 %in% 1:5) %>%
mutate(age = HSAGEIR,
sex = ifelse(HSSEX == 1, "Male", "Female"),
year = ifelse(SDPPHASE == 1, 1989.5, 1992.5),
srh = 6 - HAB1)
glimpse(nhanes3_selected)
#write_csv(nhanes3_selected, "data/nhanes3_selected.csv")
nhanes4_key <- read_csv("big_data/NHANES/nhanes_4/nhanes4_key.csv")
library(tidyverse)
library(haven)
# Assume nhanes4_key is loaded
# Separate keys by type of file (DEMO, HUQ) for simplicity
demo_key <- nhanes4_key %>% filter(str_detect(nhanes_file, "^DEMO"))
huq_key <- nhanes4_key %>% filter(str_detect(nhanes_file, "^HUQ"))
# A helper function to read and process a given domain of files
read_nhanes_domain <- function(key_table) {
# Get unique files for this domain
files <- key_table %>% distinct(nhanes_file)
domain_data <- files %>%
mutate(
data = map(nhanes_file, ~ {
vars_for_file <- key_table %>% filter(nhanes_file == .x)
needed_vars <- c("SEQN", unique(vars_for_file$nhanes_var))
file_path <- paste0("big_data/NHANES/nhanes_4/", .x, ".xpt")
# Read and select needed variables
df <- read_xpt(file_path) %>%
select(any_of(needed_vars)) %>%
# Rename nhanes_var to my_var
rename_with(
.fn = ~ vars_for_file$my_var[match(., vars_for_file$nhanes_var)],
.cols = vars_for_file$nhanes_var
) %>%
mutate(
nhanes_yr_1 = vars_for_file$nhanes_yr_1[1],
nhanes_yr_2 = vars_for_file$nhanes_yr_2[1]
)
df
})
) %>%
unnest(cols = data) # Unnest after renaming done
domain_data
}
# Read DEMO and HUQ data separately
demo_data <- read_nhanes_domain(demo_key)
huq_data <- read_nhanes_domain(huq_key)
# Now join demo and huq data by SEQN and cycle years.
# Note: If multiple cycles overlap, you may need to use both SEQN and nhanes_yr_1/nhanes_yr_2 as join keys.
# Typically SEQN is unique within a cycle, so joining on SEQN and year information might be prudent.
final_data <- demo_data %>%
full_join(huq_data, by = c("SEQN", "nhanes_yr_1", "nhanes_yr_2"))
# Now select the columns you need:
final_data <- final_data %>%
select(
SEQN,
age,
srh_huq010,
SDDSRVYR,
nhanes_yr_1,
nhanes_yr_2
)
glimpse(final_data)
nhanes4_selected <- final_data %>%
filter(srh_huq010 %in% 1:5) %>%
filter(age >= 18) %>%
mutate(srh = 6 - srh_huq010) %>%
mutate(year = (nhanes_yr_1 + nhanes_yr_2 ) / 2 ) %>%
mutate(cohort = year - age)
glimpse(nhanes4_selected)
write_csv(nhanes4_selected, "big_data/NHANES/nhanes_4/nhanes4_selected_apcsrh.csv")
write_csv(nhanes4_selected, "data/nhanes4_selected_apcsrh.csv")
nhanes3 <- read_csv("data/nhanes3_selected.csv") %>%
select(SEQN, age, year, cohort, srh)
## Rows: 20037 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): sex
## dbl (13): SEQN, DMPFSEQ, HSAGEIR, HAB1, HSSEX, SDPPHASE, HSDOIMO, HSAGEU, HS...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nhanes4 <- read_csv("data/nhanes4_selected_apcsrh.csv") %>%
select(SEQN, age, year, cohort, srh)
## Rows: 113188 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): SEQN, age, srh_huq010, SDDSRVYR, nhanes_yr_1, nhanes_yr_2, srh, yea...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_nhanes <- rbind(nhanes3, nhanes4)
data_nhanes <- data_nhanes %>%
na.omit() %>%
filter(age >= 18)
glimpse(data_nhanes)
## Rows: 117,441
## Columns: 5
## $ SEQN <dbl> 3, 4, 9, 10, 11, 19, 34, 44, 45, 48, 49, 51, 52, 53, 54, 55, 56…
## $ age <dbl> 21, 32, 48, 35, 48, 44, 42, 24, 67, 56, 82, 44, 50, 36, 19, 48,…
## $ year <dbl> 1989.5, 1989.5, 1989.5, 1989.5, 1989.5, 1989.5, 1989.5, 1989.5,…
## $ cohort <dbl> 1968.5, 1957.5, 1941.5, 1954.5, 1941.5, 1945.5, 1947.5, 1965.5,…
## $ srh <dbl> 5, 4, 4, 4, 2, 4, 5, 3, 4, 4, 3, 3, 5, 3, 3, 2, 4, 3, 2, 3, 4, …
table(data_nhanes$srh)
##
## 1 2 3 4 5
## 3968 17146 36570 29868 29889
table(data_nhanes$year)
##
## 1989.5 1992.5 1999.5 2001.5 2003.5 2005.5 2007.5 2009.5 2011.5 2013.5 2015.5
## 9890 9715 9942 11026 10114 10340 10146 10534 9754 10170 9961
## 2017.5
## 5849
table(data_nhanes$cohort)
##
## 1899.5 1900.5 1901.5 1902.5 1903.5 1904.5 1905.5 1906.5 1907.5 1908.5 1909.5
## 104 35 38 155 92 103 150 169 179 222 226
## 1910.5 1911.5 1912.5 1913.5 1914.5 1915.5 1916.5 1917.5 1918.5 1919.5 1920.5
## 174 201 240 170 324 233 484 350 554 447 603
## 1921.5 1922.5 1923.5 1924.5 1925.5 1926.5 1927.5 1928.5 1929.5 1930.5 1931.5
## 476 449 467 501 503 511 993 594 1041 564 1058
## 1932.5 1933.5 1934.5 1935.5 1936.5 1937.5 1938.5 1939.5 1940.5 1941.5 1942.5
## 734 986 782 1202 823 1336 933 1067 1007 1010 1152
## 1943.5 1944.5 1945.5 1946.5 1947.5 1948.5 1949.5 1950.5 1951.5 1952.5 1953.5
## 1195 1067 1195 1326 1502 1501 1573 1549 1676 1670 1858
## 1954.5 1955.5 1956.5 1957.5 1958.5 1959.5 1960.5 1961.5 1962.5 1963.5 1964.5
## 1978 2107 2278 2253 2366 2424 2495 2545 2598 2708 2665
## 1965.5 1966.5 1967.5 1968.5 1969.5 1970.5 1971.5 1972.5 1973.5 1974.5 1975.5
## 2602 2566 2703 2762 2891 2899 2939 2412 2416 2284 2173
## 1976.5 1977.5 1978.5 1979.5 1980.5 1981.5 1982.5 1983.5 1984.5 1985.5 1986.5
## 2078 2099 1864 1966 1870 1880 1435 1320 1377 1146 1012
## 1987.5 1988.5 1989.5 1990.5 1991.5 1992.5 1993.5 1994.5 1995.5 1996.5 1997.5
## 878 710 649 523 448 356 322 211 190 109 93
## 1998.5 1999.5
## 143 144
hist(data_nhanes$age)
hist(data_nhanes$year)
hist(data_nhanes$cohort)
hist(data_nhanes$srh)
data_nhanes %>%
mutate(age = cut(age, breaks = 6)) %>% # Create cohorts with 6 breaks
group_by(age, year) %>%
dplyr::summarize(mean_health = mean(srh, na.rm = TRUE), .groups = "drop") %>%
ggplot(aes(x = year, y = mean_health, color = age)) +
geom_line() +
geom_point() +
theme_minimal() +
labs(title = "Average SRH Per Year for Each Age Group",
subtitle = "NHANES III and IV Datasets",
x = "Year",
y = "Average SRH")
data_nhanes %>%
group_by(age, year) %>%
summarize(mean_health = mean(srh)) %>%
ggplot(aes(x = age, y = mean_health)) +
geom_line(color = "cornflowerblue") +
facet_wrap(~ year) +
labs(title = "Self-Rated Health By Age (Per Year)",
subtitle = "NHANES III and IV Datasets",
x = "Age of Respondent",
y = "Average SRH",
)
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.
library(broom)
# Aggregate slopes
# Perform linear regression for each year and extract the coefficient of 'age' with confidence intervals, se, t stat, p val
lm_health_v_age_0 <- data_nhanes %>%
group_by(year) %>%
do(tidy(lm(srh ~ age, data = .), conf.int = TRUE)) %>% # Add conf.int = TRUE for CIs
filter(term == "age") %>%
select(year, coef = estimate, conf.low, conf.high, se = std.error, t_statistic = statistic, p_value = p.value)
# View the results with confidence intervals, se, t statistic, and p value
# print(lm_health_v_age_0)
knitr::kable(lm_health_v_age_0,
caption = "NHANES III and IV Datasets")
| year | coef | conf.low | conf.high | se | t_statistic | p_value |
|---|---|---|---|---|---|---|
| 1989.5 | -0.0128228 | -0.0138368 | -0.0118089 | 0.0005173 | -24.79012 | 0 |
| 1992.5 | -0.0115236 | -0.0125728 | -0.0104745 | 0.0005352 | -21.53062 | 0 |
| 1999.5 | -0.0166679 | -0.0179948 | -0.0153411 | 0.0006769 | -24.62424 | 0 |
| 2001.5 | -0.0157482 | -0.0169775 | -0.0145189 | 0.0006271 | -25.11140 | 0 |
| 2003.5 | -0.0170125 | -0.0182480 | -0.0157771 | 0.0006303 | -26.99218 | 0 |
| 2005.5 | -0.0139011 | -0.0151836 | -0.0126185 | 0.0006543 | -21.24532 | 0 |
| 2007.5 | -0.0188795 | -0.0201763 | -0.0175826 | 0.0006616 | -28.53685 | 0 |
| 2009.5 | -0.0151905 | -0.0164678 | -0.0139132 | 0.0006516 | -23.31204 | 0 |
| 2011.5 | -0.0169800 | -0.0183033 | -0.0156567 | 0.0006751 | -25.15261 | 0 |
| 2013.5 | -0.0150377 | -0.0163551 | -0.0137203 | 0.0006721 | -22.37523 | 0 |
| 2015.5 | -0.0170124 | -0.0183518 | -0.0156729 | 0.0006833 | -24.89712 | 0 |
| 2017.5 | -0.0103421 | -0.0117272 | -0.0089570 | 0.0007066 | -14.63698 | 0 |
# Plot coefficients
ggplot(lm_health_v_age_0, aes(x = year, y = coef)) +
geom_point() +
labs(
title = "Change in 'Age' Coefficient Over Years",
subtitle = "NHANES III and IV Datasets",
x = "Year",
y = "Coefficient of Age"
) +
theme_minimal()
## Regress the srh vs age coefficients from each year on the year of the survey
# Visualize
ggplot(lm_health_v_age_0, aes(x = year, y = coef)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE) + # Adds the regression line with standard error shading
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) + # Confidence intervals for the coefficients
labs(
title = "Regression of 'Age' Coefficient Over Years",
subtitle = "NHANES III and IV Datasets",
x = "Year",
y = "Coefficient of Age"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Perform linear regression of 'coef' (age coefficient) vs 'year'
lm_coef_vs_year <- lm(coef ~ year, data = lm_health_v_age_0)
# View the summary of the regression
summary(lm_coef_vs_year)
##
## Call:
## lm(formula = coef ~ year, data = lm_health_v_age_0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0036777 -0.0016436 -0.0003809 0.0012305 0.0054258
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.847e-02 1.791e-01 0.550 0.595
## year -5.662e-05 8.929e-05 -0.634 0.540
##
## Residual standard error: 0.002595 on 10 degrees of freedom
## Multiple R-squared: 0.03866, Adjusted R-squared: -0.05748
## F-statistic: 0.4021 on 1 and 10 DF, p-value: 0.5402
# Self-Rated Health Category Distribution
data_nhanes %>%
mutate(health = srh) %>%
select(year, health) %>%
filter(!is.na(health)) %>%
mutate(
srh = factor(health,
levels = 1:5,
labels = c("Poor", "Fair", "Good", "Very Good", "Excellent"))) %>%
# Remove missing values
# Calculate percentages by year
group_by(year) %>%
count(srh) %>%
mutate(percent = n / sum(n) * 100) %>%
ungroup() %>%
ggplot(aes(x = year, y = percent, color = srh)) +
geom_line() +
geom_point() +
labs(title = "Self-Rated Health Category Distribution",
subtitle = "GSS Dataset",
y = "Percent",
x = "Year") +
theme_minimal()
# Self-Rated Health Category Distribution by Age Group
data_nhanes %>%
mutate(age_group = cut(age, breaks = 6)) %>% # Create cohorts with 6 breaks
mutate(health = srh) %>%
select(year, health, age_group) %>%
filter(!is.na(health)) %>%
mutate(
srh = factor(health,
levels = 1:5,
labels = c("Poor", "Fair", "Good", "Very Good", "Excellent"))) %>%
# Remove missing values
# Calculate percentages by year
group_by(year, age_group) %>%
count(srh) %>%
mutate(percent = n / sum(n) * 100) %>%
ungroup() %>%
ggplot(aes(x = year, y = percent, color = srh)) +
geom_line() +
geom_point() +
labs(title = "Self-Rated Health Category Distribution",
subtitle = "GSS Dataset",
y = "Percent",
x = "Year") +
facet_wrap(~age_group) +
theme_minimal()
Maybe the different NHANES studies aren’t comparable – let’s try only NHANES 4
nhanes4 <- nhanes4 %>%
filter(age >= 18) %>%
na.omit()
nhanes4 %>%
mutate(age = cut(age, breaks = 6)) %>% # Create cohorts with 6 breaks
group_by(age, year) %>%
dplyr::summarize(mean_health = mean(srh, na.rm = TRUE), .groups = "drop") %>%
ggplot(aes(x = year, y = mean_health, color = age)) +
geom_line() +
geom_point() +
labs(title = "Average SRH Per Year for Each Age Group",
subtitle = "NHANES IV",
x = "Year",
y = "Average SRH")
nhanes4 %>%
group_by(age, year) %>%
summarize(mean_health = mean(srh)) %>%
ggplot(aes(x = age, y = mean_health)) +
geom_line(color = "cornflowerblue") +
geom_smooth() +
facet_wrap(~ year) +
labs(title = "Self-Rated Health By Age (Per Year)",
subtitle = "NHANES IV Dataset",
x = "Age of Respondent",
y = "Average SRH",
)
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
nhanes4 %>%
group_by(age, year) %>%
summarize(mean_health = mean(srh)) %>%
ggplot(aes(x = age, y = mean_health)) +
geom_line(color = "cornflowerblue") +
geom_smooth() +
facet_wrap(~ year) +
labs(title = "Self-Rated Health By Age (Per Year)",
subtitle = "NHANES IV Dataset",
x = "Age of Respondent",
y = "Average SRH",
)
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
nhanes4 %>%
group_by(age, year) %>%
summarize(mean_health = mean(srh)) %>%
ggplot(aes(x = age, y = mean_health)) +
geom_line(color = "cornflowerblue") +
geom_smooth(method = "lm") +
facet_wrap(~ year) +
labs(title = "Self-Rated Health By Age (Per Year)",
subtitle = "NHANES IV Dataset",
x = "Age of Respondent",
y = "Average SRH",
)
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.
## `geom_smooth()` using formula = 'y ~ x'
library(broom)
# Aggregate slopes
# Perform linear regression for each year and extract the coefficient of 'age' with confidence intervals, se, t stat, p val
lm_health_v_age_0 <- nhanes4 %>%
group_by(year) %>%
do(tidy(lm(srh ~ age, data = .), conf.int = TRUE)) %>% # Add conf.int = TRUE for CIs
filter(term == "age") %>%
select(year, coef = estimate, conf.low, conf.high, se = std.error, t_statistic = statistic, p_value = p.value)
# View the results with confidence intervals, se, t statistic, and p value
# print(lm_health_v_age_0)
knitr::kable(lm_health_v_age_0,
caption = "NHANES IV")
| year | coef | conf.low | conf.high | se | t_statistic | p_value |
|---|---|---|---|---|---|---|
| 1999.5 | -0.0166679 | -0.0179948 | -0.0153411 | 0.0006769 | -24.62424 | 0 |
| 2001.5 | -0.0157482 | -0.0169775 | -0.0145189 | 0.0006271 | -25.11140 | 0 |
| 2003.5 | -0.0170125 | -0.0182480 | -0.0157771 | 0.0006303 | -26.99218 | 0 |
| 2005.5 | -0.0139011 | -0.0151836 | -0.0126185 | 0.0006543 | -21.24532 | 0 |
| 2007.5 | -0.0188795 | -0.0201763 | -0.0175826 | 0.0006616 | -28.53685 | 0 |
| 2009.5 | -0.0151905 | -0.0164678 | -0.0139132 | 0.0006516 | -23.31204 | 0 |
| 2011.5 | -0.0169800 | -0.0183033 | -0.0156567 | 0.0006751 | -25.15261 | 0 |
| 2013.5 | -0.0150377 | -0.0163551 | -0.0137203 | 0.0006721 | -22.37523 | 0 |
| 2015.5 | -0.0170124 | -0.0183518 | -0.0156729 | 0.0006833 | -24.89712 | 0 |
| 2017.5 | -0.0103421 | -0.0117272 | -0.0089570 | 0.0007066 | -14.63698 | 0 |
# Plot coefficients
ggplot(lm_health_v_age_0, aes(x = year, y = coef)) +
geom_point() +
labs(
title = "Change in 'Age' Coefficient Over Years",
subtitle = "NHANES IV Dataset",
x = "Year",
y = "Coefficient of Age"
) +
theme_minimal()
## Regress the srh vs age coefficients from each year on the year of the survey
# Visualize
ggplot(lm_health_v_age_0, aes(x = year, y = coef)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE) + # Adds the regression line with standard error shading
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) + # Confidence intervals for the coefficients
labs(
title = "Regression of 'Age' Coefficient Over Years",
subtitle = "NHANES IV Dataset",
x = "Year",
y = "Coefficient of Age"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Perform linear regression of 'coef' (age coefficient) vs 'year'
lm_coef_vs_year <- lm(coef ~ year, data = lm_health_v_age_0)
# View the summary of the regression
summary(lm_coef_vs_year)
##
## Call:
## lm(formula = coef ~ year, data = lm_health_v_age_0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0030434 -0.0014698 0.0000866 0.0008902 0.0039057
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3346611 0.2485994 -1.346 0.215
## year 0.0001588 0.0001238 1.283 0.235
##
## Residual standard error: 0.002248 on 8 degrees of freedom
## Multiple R-squared: 0.1707, Adjusted R-squared: 0.06701
## F-statistic: 1.646 on 1 and 8 DF, p-value: 0.2354
# Self-Rated Health Category Distribution
nhanes4 %>%
mutate(health = srh) %>%
select(year, health) %>%
filter(!is.na(health)) %>%
mutate(
srh = factor(health,
levels = 1:5,
labels = c("Poor", "Fair", "Good", "Very Good", "Excellent"))) %>%
# Remove missing values
# Calculate percentages by year
group_by(year) %>%
count(srh) %>%
mutate(percent = n / sum(n) * 100) %>%
ungroup() %>%
ggplot(aes(x = year, y = percent, color = srh)) +
geom_line() +
geom_point() +
labs(title = "Self-Rated Health Category Distribution",
subtitle = "GSS Dataset",
y = "Percent",
x = "Year") +
theme_minimal()
# Self-Rated Health Category Distribution by Age Group
nhanes4 %>%
mutate(age_group = cut(age, breaks = 6)) %>% # Create cohorts with 6 breaks
mutate(health = srh) %>%
select(year, health, age_group) %>%
filter(!is.na(health)) %>%
mutate(
srh = factor(health,
levels = 1:5,
labels = c("Poor", "Fair", "Good", "Very Good", "Excellent"))) %>%
# Remove missing values
# Calculate percentages by year
group_by(year, age_group) %>%
count(srh) %>%
mutate(percent = n / sum(n) * 100) %>%
ungroup() %>%
ggplot(aes(x = year, y = percent, color = srh)) +
geom_line() +
geom_point() +
labs(title = "Self-Rated Health Category Distribution",
subtitle = "GSS Dataset",
y = "Percent",
x = "Year") +
facet_wrap(~age_group) +
theme_minimal()